1. Basic Data inspection¶
In [14]:
# importing all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.subplots as pp
import seaborn as sns
from wordcloud import WordCloud
from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, SVC
from sklearn.decomposition import PCA
from sklearn.multioutput import MultiOutputClassifier
import json
import pickle
import bz2file as bz2
Loading dataset¶
In [15]:
df = pd.read_csv("Airbnb_Data.csv")
pd.set_option('display.max_columns', 100) # To view all columns
df.head()
Out[15]:
| id | log_price | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | city | description | first_review | host_has_profile_pic | host_identity_verified | host_response_rate | host_since | instant_bookable | last_review | latitude | longitude | name | neighbourhood | number_of_reviews | review_scores_rating | thumbnail_url | zipcode | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6901257 | 5.010635 | Apartment | Entire home/apt | {"Wireless Internet","Air conditioning",Kitche... | 3 | 1.0 | Real Bed | strict | True | NYC | Beautiful, sunlit brownstone 1-bedroom in the ... | 2016-06-18 | t | t | NaN | 2012-03-26 | f | 2016-07-18 | 40.696524 | -73.991617 | Beautiful brownstone 1-bedroom | Brooklyn Heights | 2 | 100.0 | https://a0.muscache.com/im/pictures/6d7cbbf7-c... | 11201 | 1.0 | 1.0 |
| 1 | 6304928 | 5.129899 | Apartment | Entire home/apt | {"Wireless Internet","Air conditioning",Kitche... | 7 | 1.0 | Real Bed | strict | True | NYC | Enjoy travelling during your stay in Manhattan... | 2017-08-05 | t | f | 100% | 2017-06-19 | t | 2017-09-23 | 40.766115 | -73.989040 | Superb 3BR Apt Located Near Times Square | Hell's Kitchen | 6 | 93.0 | https://a0.muscache.com/im/pictures/348a55fe-4... | 10019 | 3.0 | 3.0 |
| 2 | 7919400 | 4.976734 | Apartment | Entire home/apt | {TV,"Cable TV","Wireless Internet","Air condit... | 5 | 1.0 | Real Bed | moderate | True | NYC | The Oasis comes complete with a full backyard ... | 2017-04-30 | t | t | 100% | 2016-10-25 | t | 2017-09-14 | 40.808110 | -73.943756 | The Garden Oasis | Harlem | 10 | 92.0 | https://a0.muscache.com/im/pictures/6fae5362-9... | 10027 | 1.0 | 3.0 |
| 3 | 13418779 | 6.620073 | House | Entire home/apt | {TV,"Cable TV",Internet,"Wireless Internet",Ki... | 4 | 1.0 | Real Bed | flexible | True | SF | This light-filled home-away-from-home is super... | NaN | t | t | NaN | 2015-04-19 | f | NaN | 37.772004 | -122.431619 | Beautiful Flat in the Heart of SF! | Lower Haight | 0 | NaN | https://a0.muscache.com/im/pictures/72208dad-9... | 94117.0 | 2.0 | 2.0 |
| 4 | 3808709 | 4.744932 | Apartment | Entire home/apt | {TV,Internet,"Wireless Internet","Air conditio... | 2 | 1.0 | Real Bed | moderate | True | DC | Cool, cozy, and comfortable studio located in ... | 2015-05-12 | t | t | 100% | 2015-03-01 | t | 2017-01-22 | 38.925627 | -77.034596 | Great studio in midtown DC | Columbia Heights | 4 | 40.0 | NaN | 20009 | 0.0 | 1.0 |
Shape and details of dataset¶
In [16]:
print('Rows: ',df.shape[0])
print('Columns: ',df.shape[1])
Rows: 74111 Columns: 29
In [17]:
print('Dataframe details: \n')
df.info(verbose=True)
Dataframe details: <class 'pandas.core.frame.DataFrame'> RangeIndex: 74111 entries, 0 to 74110 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 74111 non-null int64 1 log_price 74111 non-null float64 2 property_type 74111 non-null object 3 room_type 74111 non-null object 4 amenities 74111 non-null object 5 accommodates 74111 non-null int64 6 bathrooms 73911 non-null float64 7 bed_type 74111 non-null object 8 cancellation_policy 74111 non-null object 9 cleaning_fee 74111 non-null bool 10 city 74111 non-null object 11 description 74111 non-null object 12 first_review 58247 non-null object 13 host_has_profile_pic 73923 non-null object 14 host_identity_verified 73923 non-null object 15 host_response_rate 55812 non-null object 16 host_since 73923 non-null object 17 instant_bookable 74111 non-null object 18 last_review 58284 non-null object 19 latitude 74111 non-null float64 20 longitude 74111 non-null float64 21 name 74111 non-null object 22 neighbourhood 67239 non-null object 23 number_of_reviews 74111 non-null int64 24 review_scores_rating 57389 non-null float64 25 thumbnail_url 65895 non-null object 26 zipcode 73145 non-null object 27 bedrooms 74020 non-null float64 28 beds 73980 non-null float64 dtypes: bool(1), float64(7), int64(3), object(18) memory usage: 15.9+ MB
In [18]:
df.columns
Out[18]:
Index(['id', 'log_price', 'property_type', 'room_type', 'amenities',
'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy',
'cleaning_fee', 'city', 'description', 'first_review',
'host_has_profile_pic', 'host_identity_verified', 'host_response_rate',
'host_since', 'instant_bookable', 'last_review', 'latitude',
'longitude', 'name', 'neighbourhood', 'number_of_reviews',
'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds'],
dtype='object')
Basic statistical summary of the numerical columns¶
In [19]:
df.describe()
Out[19]:
| id | log_price | accommodates | bathrooms | latitude | longitude | number_of_reviews | review_scores_rating | bedrooms | beds | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 7.411100e+04 | 74111.000000 | 74111.000000 | 73911.000000 | 74111.000000 | 74111.000000 | 74111.000000 | 57389.000000 | 74020.000000 | 73980.000000 |
| mean | 1.126662e+07 | 4.782069 | 3.155146 | 1.235263 | 38.445958 | -92.397525 | 20.900568 | 94.067365 | 1.265793 | 1.710868 |
| std | 6.081735e+06 | 0.717394 | 2.153589 | 0.582044 | 3.080167 | 21.705322 | 37.828641 | 7.836556 | 0.852143 | 1.254142 |
| min | 3.440000e+02 | 0.000000 | 1.000000 | 0.000000 | 33.338905 | -122.511500 | 0.000000 | 20.000000 | 0.000000 | 0.000000 |
| 25% | 6.261964e+06 | 4.317488 | 2.000000 | 1.000000 | 34.127908 | -118.342374 | 1.000000 | 92.000000 | 1.000000 | 1.000000 |
| 50% | 1.225415e+07 | 4.709530 | 2.000000 | 1.000000 | 40.662138 | -76.996965 | 6.000000 | 96.000000 | 1.000000 | 1.000000 |
| 75% | 1.640226e+07 | 5.220356 | 4.000000 | 1.000000 | 40.746096 | -73.954660 | 23.000000 | 100.000000 | 1.000000 | 2.000000 |
| max | 2.123090e+07 | 7.600402 | 16.000000 | 8.000000 | 42.390437 | -70.985047 | 605.000000 | 100.000000 | 10.000000 | 18.000000 |
Null Values¶
In [20]:
print('\nNull values in dataset:\n')
df.isnull().sum().sort_values(ascending=False)
Null values in dataset:
Out[20]:
host_response_rate 18299 review_scores_rating 16722 first_review 15864 last_review 15827 thumbnail_url 8216 neighbourhood 6872 zipcode 966 bathrooms 200 host_identity_verified 188 host_since 188 host_has_profile_pic 188 beds 131 bedrooms 91 description 0 name 0 property_type 0 room_type 0 amenities 0 number_of_reviews 0 accommodates 0 longitude 0 city 0 latitude 0 bed_type 0 instant_bookable 0 cancellation_policy 0 cleaning_fee 0 log_price 0 id 0 dtype: int64
2. Visualization¶
Converting boolean and log values¶
In [21]:
# Replacing columns with f/t and boolean with 0/1
df = df.infer_objects(copy=False).replace({'f': 0, 't': 1})
df = df.infer_objects(copy=False).replace({False: 0, True: 1})
df['price'] = round(np.exp(df['log_price']), 2)
C:\Users\robbi\AppData\Local\Temp\ipykernel_220\600328113.py:2: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
df = df.infer_objects(copy=False).replace({'f': 0, 't': 1})
C:\Users\robbi\AppData\Local\Temp\ipykernel_220\600328113.py:3: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
df = df.infer_objects(copy=False).replace({False: 0, True: 1})
Calculating price per person¶
In [22]:
df['price_per_person'] = df['price']/df['accommodates']
Histogram for all numerical data¶
In [23]:
# Plotting the distribution of numerical features
numerical_features = ['accommodates', 'bathrooms', 'beds', 'bedrooms', 'review_scores_rating', 'number_of_reviews', 'host_response_rate']
df[numerical_features].hist(figsize=(20,20));
Box plots¶
In [24]:
plt.figure(figsize=(10, 6))
sns.boxplot(data=df[numerical_features])
plt.xlabel('Features')
plt.ylabel('Values')
plt.title('Box Plot of Numerical Features')
plt.xticks(rotation=45)
plt.show()
Price Distribution Histogram¶
In [25]:
plt.figure(figsize=(10, 6))
plt.hist(df['price'], bins=30, color='skyblue', edgecolor='black', alpha=.8, label='Price')
plt.hist(df['price_per_person'], bins=30, color='orange', edgecolor='red', alpha=.5, label='Price Per Person')
plt.title('Price Distribution Histogram')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
# plt.savefig("price.png", dpi=1000)
plt.show()
Price by room_type¶
In [26]:
avg_price_by_room_type = df.groupby('room_type')['price'].mean().sort_values(ascending=False)
plt.figure(figsize=(10, 6))
bars = avg_price_by_room_type.plot(kind='bar', color='lightgreen')
for i, v in enumerate(avg_price_by_room_type):
plt.text(i, v + 1, str(round(v, 2)), ha='center', va='bottom', fontsize=7)
plt.title('Average Price by Room Type')
plt.xlabel('Room Type')
plt.ylabel('Average Price')
plt.xticks(rotation=45)
plt.grid(axis='y')
# plt.savefig("roomType_price.png", dpi=1000)
plt.show()
Instant Bookable Property Percentage¶
In [27]:
plt.figure(figsize=(8, 6))
instant_bookable_counts = df['instant_bookable'].value_counts()
labels = np.where(instant_bookable_counts.index == 1, 'Yes', 'No')
instant_bookable_counts.plot(kind='pie', autopct='%1.1f%%', colors=['lightcoral', 'lightblue'], labels=labels)
plt.title('Instant Bookable Property Percentage')
plt.ylabel('')
plt.show()
Price Trends Over Time as 'first_review' is in datetime format¶
In [28]:
df['first_review'] = pd.to_datetime(df['first_review'])
df['first_review'].dt.year
Out[28]:
0 2016.0
1 2017.0
2 2017.0
3 NaN
4 2015.0
...
74106 NaN
74107 2016.0
74108 2015.0
74109 NaN
74110 2013.0
Name: first_review, Length: 74111, dtype: float64
In [29]:
df['year'] = df['first_review'].dt.year
avg_price_by_year = df.dropna(subset=['first_review']).groupby('year')['price'].mean()
avg_price_per_person_by_year = df.dropna(subset=['first_review']).groupby('year')['price_per_person'].mean()
plt.figure(figsize=(10, 6))
avg_price_by_year.plot(marker='o', color='orange', label='Total Price')
avg_price_per_person_by_year.plot(marker='o', color='green', label='Price per person')
plt.title('Average Price Trends Over Time')
plt.xlabel('Year')
plt.ylabel('Average Price')
plt.grid(True)
plt.xticks(avg_price_by_year.index)
plt.legend(loc="upper right")
# plt.savefig("price_trend.png", dpi=1000)
plt.show()
Average price by City¶
In [30]:
plt.figure(figsize=(10, 6))
avg_price_by_city = df.groupby('city')['price'].mean().sort_values(ascending=False)
bars = avg_price_by_city.plot(kind='bar', color='lightcoral')
for i, v in enumerate(avg_price_by_city):
plt.text(i, v + 1, str(round(v, 2)), ha='center', va='bottom', fontsize=7)
plt.title('Average Price by City')
plt.xlabel('City')
plt.ylabel('Average Price')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()
Distrubition of price on map¶
In [31]:
color_scale = [(0, 'green'), (1,'red')]
fig = px.scatter_mapbox(df,
lat="latitude",
lon="longitude",
hover_name="neighbourhood",
hover_data=["neighbourhood", "accommodates"],
color="price",
color_continuous_scale=color_scale,
# size="price",
zoom=3,
height=800,
width=800)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
Amenities¶
In [32]:
amenities_list = df['amenities'].str.replace('{', '').str.replace('}', ',').str.replace('"', '')
df['amenities_clean'] = amenities_list
amenities_list
Out[32]:
0 Wireless Internet,Air conditioning,Kitchen,Hea...
1 Wireless Internet,Air conditioning,Kitchen,Hea...
2 TV,Cable TV,Wireless Internet,Air conditioning...
3 TV,Cable TV,Internet,Wireless Internet,Kitchen...
4 TV,Internet,Wireless Internet,Air conditioning...
...
74106 ,
74107 TV,Cable TV,Internet,Wireless Internet,Kitchen...
74108 TV,Internet,Wireless Internet,Air conditioning...
74109 TV,Wireless Internet,Air conditioning,Kitchen,...
74110 TV,Internet,Wireless Internet,Kitchen,Free par...
Name: amenities, Length: 74111, dtype: object
In [33]:
# amenities_list = list(df.amenities)
amenities_list_all = " ".join(amenities_list)
amenities_list_all = [amenity.strip().lower()
.replace("(", "")
.replace(")", "")
.replace("_", " ")
.replace(" & ", " and ")
.replace("-", " ")
for amenity in amenities_list_all.split(',')
if 'translation missing: ' not in amenity]
amenities_set = set(amenities_list_all)
len(amenities_set), amenities_set
Out[33]:
(128,
{'',
'24 hour check in',
'accessible height bed',
'accessible height toilet',
'air conditioning',
'air purifier',
'baby bath',
'baby monitor',
'babysitter recommendations',
'bath towel',
'bathtub',
'bathtub with shower chair',
'bbq grill',
'beach essentials',
'beachfront',
'bed linens',
'body soap',
'breakfast',
'buzzer/wireless intercom',
'cable tv',
'carbon monoxide detector',
'cats',
'changing table',
'children’s books and toys',
'children’s dinnerware',
'cleaning before checkout',
'coffee maker',
'cooking basics',
'crib',
'disabled parking spot',
'dishes and silverware',
'dishwasher',
'dogs',
'doorman',
'doorman entry',
'dryer',
'elevator',
'elevator in building',
'essentials',
'ethernet connection',
'ev charger',
'extra pillows and blankets',
'family/kid friendly',
'fire extinguisher',
'fireplace guards',
'firm matress',
'firm mattress',
'first aid kit',
'fixed grab bars for shower and toilet',
'flat',
'flat smooth pathway to front door',
'free parking on premises',
'free parking on street',
'game console',
'garden or backyard',
'grab rails for shower and toilet',
'ground floor access',
'gym',
'hair dryer',
'hand or paper towel',
'hand soap',
'handheld shower head',
'hangers',
'heating',
'high chair',
'host greets you',
'hot tub',
'hot water',
'hot water kettle',
'indoor fireplace',
'internet',
'iron',
'keypad',
'kitchen',
'lake access',
'laptop friendly workspace',
'lock on bedroom door',
'lockbox',
'long term stays allowed',
'luggage dropoff allowed',
'microwave',
'other',
'other pets',
'outlet covers',
'oven',
'pack ’n play/travel crib',
'paid parking off premises',
'path to entrance lit at night',
'patio or balcony',
'pets allowed',
'pets live on this property',
'pocket wifi',
'pool',
'private bathroom',
'private entrance',
'private living room',
'refrigerator',
'roll in shower with chair',
'room darkening shades',
'safety card',
'self check in',
'shampoo',
'single level home',
'ski in/ski out',
'smart lock',
'smartlock',
'smoke detector',
'smoking allowed',
'smooth pathway to front door',
'stair gates',
'step free access',
'stove',
'suitable for events',
'table corner guards',
'toilet paper',
'tv',
'washer',
'washer / dryer',
'waterfront',
'well lit path to entrance',
'wheelchair accessible',
'wide clearance to bed',
'wide clearance to shower and toilet',
'wide doorway',
'wide entryway',
'wide hallway clearance',
'window guards',
'wireless internet'})
In [34]:
from collections import Counter
word_count_dict = Counter(amenities_list_all)
wordcloud = WordCloud(width=1000, height=500, background_color='white').generate_from_frequencies(word_count_dict)
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='nearest')
plt.axis("off")
# plt.savefig("wordcloud.png", bbox_inches="tight")
plt.show()
3. Data Preprocessing¶
In [35]:
df['host_response_rate'] = df['host_response_rate'].str.replace('%', '').astype('float')
df.head()
Out[35]:
| id | log_price | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | city | description | first_review | host_has_profile_pic | host_identity_verified | host_response_rate | host_since | instant_bookable | last_review | latitude | longitude | name | neighbourhood | number_of_reviews | review_scores_rating | thumbnail_url | zipcode | bedrooms | beds | price | price_per_person | year | amenities_clean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6901257 | 5.010635 | Apartment | Entire home/apt | {"Wireless Internet","Air conditioning",Kitche... | 3 | 1.0 | Real Bed | strict | 1 | NYC | Beautiful, sunlit brownstone 1-bedroom in the ... | 2016-06-18 | 1.0 | 1.0 | NaN | 2012-03-26 | 0 | 2016-07-18 | 40.696524 | -73.991617 | Beautiful brownstone 1-bedroom | Brooklyn Heights | 2 | 100.0 | https://a0.muscache.com/im/pictures/6d7cbbf7-c... | 11201 | 1.0 | 1.0 | 150.0 | 50.000000 | 2016.0 | Wireless Internet,Air conditioning,Kitchen,Hea... |
| 1 | 6304928 | 5.129899 | Apartment | Entire home/apt | {"Wireless Internet","Air conditioning",Kitche... | 7 | 1.0 | Real Bed | strict | 1 | NYC | Enjoy travelling during your stay in Manhattan... | 2017-08-05 | 1.0 | 0.0 | 100.0 | 2017-06-19 | 1 | 2017-09-23 | 40.766115 | -73.989040 | Superb 3BR Apt Located Near Times Square | Hell's Kitchen | 6 | 93.0 | https://a0.muscache.com/im/pictures/348a55fe-4... | 10019 | 3.0 | 3.0 | 169.0 | 24.142857 | 2017.0 | Wireless Internet,Air conditioning,Kitchen,Hea... |
| 2 | 7919400 | 4.976734 | Apartment | Entire home/apt | {TV,"Cable TV","Wireless Internet","Air condit... | 5 | 1.0 | Real Bed | moderate | 1 | NYC | The Oasis comes complete with a full backyard ... | 2017-04-30 | 1.0 | 1.0 | 100.0 | 2016-10-25 | 1 | 2017-09-14 | 40.808110 | -73.943756 | The Garden Oasis | Harlem | 10 | 92.0 | https://a0.muscache.com/im/pictures/6fae5362-9... | 10027 | 1.0 | 3.0 | 145.0 | 29.000000 | 2017.0 | TV,Cable TV,Wireless Internet,Air conditioning... |
| 3 | 13418779 | 6.620073 | House | Entire home/apt | {TV,"Cable TV",Internet,"Wireless Internet",Ki... | 4 | 1.0 | Real Bed | flexible | 1 | SF | This light-filled home-away-from-home is super... | NaT | 1.0 | 1.0 | NaN | 2015-04-19 | 0 | NaN | 37.772004 | -122.431619 | Beautiful Flat in the Heart of SF! | Lower Haight | 0 | NaN | https://a0.muscache.com/im/pictures/72208dad-9... | 94117.0 | 2.0 | 2.0 | 750.0 | 187.500000 | NaN | TV,Cable TV,Internet,Wireless Internet,Kitchen... |
| 4 | 3808709 | 4.744932 | Apartment | Entire home/apt | {TV,Internet,"Wireless Internet","Air conditio... | 2 | 1.0 | Real Bed | moderate | 1 | DC | Cool, cozy, and comfortable studio located in ... | 2015-05-12 | 1.0 | 1.0 | 100.0 | 2015-03-01 | 1 | 2017-01-22 | 38.925627 | -77.034596 | Great studio in midtown DC | Columbia Heights | 4 | 40.0 | NaN | 20009 | 0.0 | 1.0 | 115.0 | 57.500000 | 2015.0 | TV,Internet,Wireless Internet,Air conditioning... |
In [36]:
df['first_review'] = pd.to_datetime(df['first_review'])
df['last_review'] = pd.to_datetime(df['last_review'])
df['host_since'] = pd.to_datetime(df['host_since'])
missing_start = df[df['host_since'].isnull() & df['first_review'].notnull()]
df.loc[df['host_since'].isnull() & df['first_review'].notnull(), ['host_since']] = missing_start['first_review']
df[df['host_since'].isnull() & df['last_review'].notnull()]
df.loc[df['host_since'].isnull() & df['last_review'].notnull(), ['host_since']] = missing_start['last_review']
columns_to_handle = {
"host_response_rate": "median",
"review_scores_rating": "median",
"bathrooms": "mean",
"beds": "mean",
"bedrooms": "mean",
"first_review": "min",
"last_review": "max",
"thumbnail_url": "unknown",
"neighbourhood": "mode",
"zipcode": "mode",
"host_identity_verified": "mode",
"host_has_profile_pic": "mode",
"host_since": "min"
}
# Group data by 'accommodates' and calculate mean for each group
mean_bedrooms = df.groupby('accommodates')['bedrooms'].mean()
mean_bathrooms = df.groupby('accommodates')['bathrooms'].mean()
mean_beds = df.groupby('accommodates')['beds'].mean()
for column, strategy in columns_to_handle.items():
if strategy == "mean":
mean = df.groupby('accommodates')[column].mean()
df[column] = df[column].fillna(df['accommodates'].map(mean))
if strategy == "median":
df[column] = df[column].fillna(df[column].median())
elif strategy == "mode":
df[column] = df[column].fillna(df[column].mode()[0])
elif strategy == "min":
df[column] = df[column].fillna(df[column].min())
elif strategy == "max":
df[column] = df[column].fillna(df[column].max())
elif strategy == "unknown":
df[column] = df[column].fillna("unknown")
df.isnull().sum()
Out[36]:
id 0 log_price 0 property_type 0 room_type 0 amenities 0 accommodates 0 bathrooms 0 bed_type 0 cancellation_policy 0 cleaning_fee 0 city 0 description 0 first_review 0 host_has_profile_pic 0 host_identity_verified 0 host_response_rate 0 host_since 0 instant_bookable 0 last_review 0 latitude 0 longitude 0 name 0 neighbourhood 0 number_of_reviews 0 review_scores_rating 0 thumbnail_url 0 zipcode 0 bedrooms 0 beds 0 price 0 price_per_person 0 year 15864 amenities_clean 0 dtype: int64
In [37]:
df1 = df.drop(["id", "name", "description", "first_review", "host_since", "last_review", "neighbourhood", "thumbnail_url", "zipcode"], axis = 1)
ordinal_encoder = OrdinalEncoder(categories=[['Shared room', 'Private room', 'Entire home/apt'],
['Airbed', 'Couch', 'Pull-out Sofa', 'Futon', 'Real Bed'],
['flexible', 'moderate', 'strict', 'super_strict_30', 'super_strict_60']]
)
df1[['room_type', 'bed_type', 'cancellation_policy']] = ordinal_encoder.fit_transform(df[['room_type', 'bed_type', 'cancellation_policy']])
label_df = df1.copy().drop(['amenities_clean', 'amenities'], axis = 1)
categorical_col = []
for column in label_df.columns:
if label_df[column].dtypes != "float64" and label_df[column].dtypes != "int64":
categorical_col.append(column)
le = LabelEncoder()
for col in categorical_col:
label_df[col] = le.fit_transform(label_df[col])
correlation_matrix = label_df.corr()
plt.figure(figsize = (20,20))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f")
plt.show()
In [38]:
# Perform one-hot encoding for nominal columns
one_hot_encoder = OneHotEncoder(drop=None, sparse_output=False)
# one_hot_encoder.fit(df[['property_type', 'city']])
encoded_features = one_hot_encoder.fit_transform(df[['property_type', 'city']])
encoded_df = pd.DataFrame(encoded_features, columns=one_hot_encoder.get_feature_names_out(['property_type', 'city']))
encoded_df.head()
df1 = pd.concat([df1.drop(['property_type', 'city'], axis=1), encoded_df], axis=1)
one_hot_encoder.categories_
Out[38]:
[array(['Apartment', 'Bed & Breakfast', 'Boat', 'Boutique hotel',
'Bungalow', 'Cabin', 'Camper/RV', 'Casa particular', 'Castle',
'Cave', 'Chalet', 'Condominium', 'Dorm', 'Earth House',
'Guest suite', 'Guesthouse', 'Hostel', 'House', 'Hut', 'In-law',
'Island', 'Lighthouse', 'Loft', 'Other', 'Parking Space',
'Serviced apartment', 'Tent', 'Timeshare', 'Tipi', 'Townhouse',
'Train', 'Treehouse', 'Vacation home', 'Villa', 'Yurt'],
dtype=object),
array(['Boston', 'Chicago', 'DC', 'LA', 'NYC', 'SF'], dtype=object)]
In [39]:
correlation_with_target = correlation_matrix['log_price']
sorted_correlation = correlation_with_target.abs().sort_values(ascending=False)
sorted_correlation
Out[39]:
log_price 1.000000 price 0.840001 room_type 0.607125 accommodates 0.567574 price_per_person 0.516145 bedrooms 0.473084 beds 0.442123 bathrooms 0.355397 cancellation_policy 0.131869 cleaning_fee 0.111191 bed_type 0.088367 review_scores_rating 0.084180 year 0.080714 property_type 0.048741 longitude 0.047529 instant_bookable 0.044271 number_of_reviews 0.032470 city 0.030913 host_identity_verified 0.024014 host_has_profile_pic 0.013171 latitude 0.002193 host_response_rate 0.001423 Name: log_price, dtype: float64
In [40]:
categorical_features = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'cleaning_fee', 'city',
'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
target_variable = 'log_price'
# Chi-square test for independence
print("Chi-square test for independence:")
for feature in categorical_features:
contingency_table = pd.crosstab(df[feature], df[target_variable])
chi2, p_value, _, _ = chi2_contingency(contingency_table)
if p_value < 0.05:
print(f"{feature}: Useful (p-value={p_value})")
else:
print(f"{feature}: Not Useful (p-value={p_value})")
Chi-square test for independence: property_type: Useful (p-value=0.0) room_type: Useful (p-value=0.0) bed_type: Not Useful (p-value=0.07518663315848466) cancellation_policy: Useful (p-value=0.0) cleaning_fee: Useful (p-value=0.0) city: Useful (p-value=0.0) host_has_profile_pic: Useful (p-value=1.331633061779495e-41) host_identity_verified: Useful (p-value=2.2559278540162352e-55) instant_bookable: Useful (p-value=1.3414375212905722e-40)
Scaling¶
Standard scaler¶
In [41]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_features])
df1[numerical_features] = scaled_features
plt.figure(figsize=(10, 6))
sns.boxplot(data=scaled_features)
plt.xlabel('Features')
plt.ylabel('Values')
plt.title('Box Plot of Numerical Features after scaling')
plt.xticks(rotation=45)
plt.show()
Model¶
In [42]:
p_type_count = df.groupby('property_type').id.count()
p_type_count.sort_values()
Out[42]:
property_type Parking Space 1 Lighthouse 1 Island 1 Casa particular 1 Train 2 Cave 2 Tipi 3 Earth House 4 Chalet 6 Treehouse 7 Hut 8 Yurt 9 Vacation home 11 Castle 13 Tent 18 Serviced apartment 21 Boat 65 Boutique hotel 69 Hostel 70 In-law 71 Cabin 72 Timeshare 77 Camper/RV 94 Guest suite 123 Dorm 142 Villa 179 Bungalow 366 Bed & Breakfast 462 Guesthouse 498 Other 607 Loft 1244 Townhouse 1692 Condominium 2658 House 16511 Apartment 49003 Name: id, dtype: int64
In [43]:
few_cats = p_type_count[p_type_count < 5].index.tolist()
cleaned_df = df.drop(df[df['property_type'].isin(few_cats)].index)
cleaned_df[cleaned_df['property_type'].isin(few_cats)]
Out[43]:
| id | log_price | property_type | room_type | amenities | accommodates | bathrooms | bed_type | cancellation_policy | cleaning_fee | city | description | first_review | host_has_profile_pic | host_identity_verified | host_response_rate | host_since | instant_bookable | last_review | latitude | longitude | name | neighbourhood | number_of_reviews | review_scores_rating | thumbnail_url | zipcode | bedrooms | beds | price | price_per_person | year | amenities_clean |
|---|
In [63]:
features = ['accommodates','bedrooms','beds','bathrooms',
'property_type', 'room_type', 'cancellation_policy', 'cleaning_fee', 'city',
'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
target = 'log_price'
ordinal_encoder = OrdinalEncoder(categories=[['Shared room', 'Private room', 'Entire home/apt'],
['flexible', 'moderate', 'strict', 'super_strict_30', 'super_strict_60']])
preprocessor = ColumnTransformer(
transformers=[
('encoder', OneHotEncoder(drop=None, sparse_output=False), ['property_type', 'city']),
('ord', ordinal_encoder, ['room_type', 'cancellation_policy']),
('scaler', StandardScaler(), ['accommodates','bedrooms','beds','bathrooms'])
],
remainder='passthrough')
linear_model = LinearRegression()
linear_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', linear_model)])
# Split data into train and test sets
X = cleaned_df[features]
y = cleaned_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the pipeline and evaluate
linear_pipeline.fit(X, y)
y_pred = linear_pipeline.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_linear)
r2_linear = r2_score(y_test, y_pred)
print("R2 Score: ", r2_linear)
Mean Squared Error: 0.2260184139932397 R2 Score: 0.5554416423023205
In [64]:
indices = range(len(y_test))
plt.figure(figsize=(20, 6))
plt.plot(indices, y_test, label='Actual', color='blue')
plt.plot(indices, y_pred, label='Predicted', color='red')
plt.xlabel('Index')
plt.ylabel('Values')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.show()
In [65]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', rf_model)])
# Fit the pipeline and evaluate
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_rf)
r2_rf = r2_score(y_test, y_pred)
print("R2 Score: ", r2_rf)
Mean Squared Error: 0.22805464552677088 R2 Score: 0.5514365538210523
In [66]:
encoded_feature_names = rf_pipeline.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out(['property_type', 'city'])
other_features = ['accommodates', 'bedrooms', 'beds', 'bathrooms', 'room_type', 'cancellation_policy', 'cleaning_fee',
'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
feature_names = list(encoded_feature_names) + other_features
feature_importances = pd.Series(rf_pipeline.named_steps['regressor'].feature_importances_, index=feature_names)
sorted_feature_importances = feature_importances.sort_values()
plt.figure(figsize=(10,10))
sns.barplot(x=sorted_feature_importances.values, y=sorted_feature_importances.index, hue=sorted_feature_importances.index, orient="h", palette="Blues_d", legend=False)
plt.title("Top Feature Importances")
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.show()
In [67]:
svr_model = SVR(kernel='rbf')
svr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', svr_model)])
# Fit the pipeline and evaluate
svr_pipeline.fit(X_train, y_train)
y_pred = svr_pipeline.predict(X_test)
mse_svr = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_svr)
r2_svr = r2_score(y_test, y_pred)
print("R2 Score: ", r2_svr)
Mean Squared Error: 0.2062936099783641 R2 Score: 0.5942385983725634
RFE¶
In [68]:
rfe = RFE(estimator=linear_model, n_features_to_select=10)
rfe_pipeline = Pipeline(steps=[('preprocesser', preprocessor), ('rfe', rfe), ('model', linear_model)])
rfe_pipeline.fit(X, y)
Out[68]:
Pipeline(steps=[('preprocesser',
ColumnTransformer(remainder='passthrough',
transformers=[('encoder',
OneHotEncoder(sparse_output=False),
['property_type', 'city']),
('ord',
OrdinalEncoder(categories=[['Shared '
'room',
'Private '
'room',
'Entire '
'home/apt'],
['flexible',
'moderate',
'strict',
'super_strict_30',
'super_strict_60']]),
['room_type',
'cancellation_policy']),
('scaler', StandardScaler(),
['accommodates', 'bedrooms',
'beds', 'bathrooms'])])),
('rfe',
RFE(estimator=LinearRegression(), n_features_to_select=10)),
('model', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocesser',
ColumnTransformer(remainder='passthrough',
transformers=[('encoder',
OneHotEncoder(sparse_output=False),
['property_type', 'city']),
('ord',
OrdinalEncoder(categories=[['Shared '
'room',
'Private '
'room',
'Entire '
'home/apt'],
['flexible',
'moderate',
'strict',
'super_strict_30',
'super_strict_60']]),
['room_type',
'cancellation_policy']),
('scaler', StandardScaler(),
['accommodates', 'bedrooms',
'beds', 'bathrooms'])])),
('rfe',
RFE(estimator=LinearRegression(), n_features_to_select=10)),
('model', LinearRegression())])ColumnTransformer(remainder='passthrough',
transformers=[('encoder', OneHotEncoder(sparse_output=False),
['property_type', 'city']),
('ord',
OrdinalEncoder(categories=[['Shared room',
'Private room',
'Entire home/apt'],
['flexible',
'moderate',
'strict',
'super_strict_30',
'super_strict_60']]),
['room_type', 'cancellation_policy']),
('scaler', StandardScaler(),
['accommodates', 'bedrooms', 'beds',
'bathrooms'])])['property_type', 'city']
OneHotEncoder(sparse_output=False)
['room_type', 'cancellation_policy']
OrdinalEncoder(categories=[['Shared room', 'Private room', 'Entire home/apt'],
['flexible', 'moderate', 'strict', 'super_strict_30',
'super_strict_60']])['accommodates', 'bedrooms', 'beds', 'bathrooms']
StandardScaler()
['cleaning_fee', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable']
passthrough
RFE(estimator=LinearRegression(), n_features_to_select=10)
LinearRegression()
LinearRegression()
LinearRegression()
In [69]:
feature_rankings = rfe_pipeline.named_steps['rfe'].ranking_
plt.figure(figsize=(10, 6))
plt.title("RFE Feature Ranking")
plt.xlabel("Number of features selected")
plt.ylabel("Feature ranking")
plt.plot(range(1, len(feature_rankings) + 1), feature_rankings)
plt.show()
In [70]:
selected_features = [feature_names[i] for i, support in enumerate(rfe_pipeline.named_steps['rfe'].support_) if support]
print("Selected features:", selected_features)
Selected features: ['property_type_Apartment', 'property_type_Bed & Breakfast', 'property_type_Cabin', 'property_type_Camper/RV', 'property_type_Dorm', 'property_type_Guest suite', 'property_type_Guesthouse', 'property_type_Hostel', 'property_type_Hut', 'property_type_Tent']
Principal Componenet Analysis¶
In [71]:
pca_pipeline = Pipeline([('preprocessor', preprocessor), ('pca', PCA())])
pca_pipeline.fit_transform(X)
pipeline = Pipeline([
('pca', pca_pipeline),
('regressor', LinearRegression())
])
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
mse_cv = -cv_scores.mean()
plt.figure(figsize=(10, 6))
pca = pca_pipeline.named_steps['pca']
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle='-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio by Number of Components')
plt.grid(True)
plt.show()
In [72]:
linear_pipeline_pca = Pipeline([
('pca', Pipeline([('preprocessor', preprocessor), ('pca', PCA(n_components=20))])),
('regressor', LinearRegression())
])
linear_pipeline_pca.fit(X_train, y_train)
y_pred = linear_pipeline_pca.predict(X_test)
mse_pca = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_pca)
r2_pca = r2_score(y_test, y_pred)
print("R2 Score: ", r2_pca)
Mean Squared Error: 0.22747886811543103 R2 Score: 0.5525690573895108
In [73]:
svr_model_pca = SVR(kernel='rbf')
svr_pipeline_pca = Pipeline(steps=[('pca', Pipeline([('preprocessor', preprocessor), ('pca', PCA(n_components=20))])),
('regressor', svr_model)])
# Fit the pipeline and evaluate
svr_pipeline_pca.fit(X_train, y_train)
y_pred = svr_pipeline_pca.predict(X_test)
mse_svr_pca = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse_svr_pca)
r2_svr_pca = r2_score(y_test, y_pred)
print("R2 Score: ", r2_svr_pca)
Mean Squared Error: 0.2080072706676499 R2 Score: 0.5908679783942161
In [45]:
amenities_text = df['amenities_clean'].apply(lambda x: x.replace('translation missing: en.hosting_amenity_49', '')
.replace('translation missing: en.hosting_amenity_50', '')
.split(','))
amenities_text = amenities_text.apply(lambda x: list(filter(None, x)))
amenities_text
Out[45]:
0 [Wireless Internet, Air conditioning, Kitchen,...
1 [Wireless Internet, Air conditioning, Kitchen,...
2 [TV, Cable TV, Wireless Internet, Air conditio...
3 [TV, Cable TV, Internet, Wireless Internet, Ki...
4 [TV, Internet, Wireless Internet, Air conditio...
...
74106 []
74107 [TV, Cable TV, Internet, Wireless Internet, Ki...
74108 [TV, Internet, Wireless Internet, Air conditio...
74109 [TV, Wireless Internet, Air conditioning, Kitc...
74110 [TV, Internet, Wireless Internet, Kitchen, Fre...
Name: amenities_clean, Length: 74111, dtype: object
In [46]:
mlb = MultiLabelBinarizer()
amenities_encoded = mlb.fit_transform(amenities_text)
amenities_df = pd.DataFrame(amenities_encoded, columns=mlb.classes_)
amenities_df = cleaned_df[features + ['price']].join(amenities_df)
amenities_df.head()
Out[46]:
| accommodates | bedrooms | beds | bathrooms | property_type | room_type | cancellation_policy | cleaning_fee | city | host_has_profile_pic | host_identity_verified | instant_bookable | price | smooth pathway to front door | 24-hour check-in | Accessible-height bed | Accessible-height toilet | Air conditioning | Air purifier | BBQ grill | Baby bath | Baby monitor | Babysitter recommendations | Bath towel | Bathtub | Bathtub with shower chair | Beach essentials | Beachfront | Bed linens | Body soap | Breakfast | Buzzer/wireless intercom | Cable TV | Carbon monoxide detector | Cat(s) | Changing table | Children’s books and toys | Children’s dinnerware | Cleaning before checkout | Coffee maker | Cooking basics | Crib | Disabled parking spot | Dishes and silverware | Dishwasher | Dog(s) | Doorman | Doorman Entry | Dryer | EV charger | ... | Long term stays allowed | Luggage dropoff allowed | Microwave | Other | Other pet(s) | Outlet covers | Oven | Pack ’n Play/travel crib | Paid parking off premises | Path to entrance lit at night | Patio or balcony | Pets allowed | Pets live on this property | Pocket wifi | Pool | Private bathroom | Private entrance | Private living room | Refrigerator | Roll-in shower with chair | Room-darkening shades | Safety card | Self Check-In | Shampoo | Single level home | Ski in/Ski out | Smart lock | Smartlock | Smoke detector | Smoking allowed | Stair gates | Step-free access | Stove | Suitable for events | TV | Table corner guards | Toilet paper | Washer | Washer / Dryer | Waterfront | Well-lit path to entrance | Wheelchair accessible | Wide clearance to bed | Wide clearance to shower & toilet | Wide clearance to shower and toilet | Wide doorway | Wide entryway | Wide hallway clearance | Window guards | Wireless Internet | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 1.0 | 1.0 | 1.0 | Apartment | Entire home/apt | strict | 1 | NYC | 1.0 | 1.0 | 0 | 150.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 7 | 3.0 | 3.0 | 1.0 | Apartment | Entire home/apt | strict | 1 | NYC | 1.0 | 0.0 | 1 | 169.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 5 | 1.0 | 3.0 | 1.0 | Apartment | Entire home/apt | moderate | 1 | NYC | 1.0 | 1.0 | 1 | 145.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 4 | 2.0 | 2.0 | 1.0 | House | Entire home/apt | flexible | 1 | SF | 1.0 | 1.0 | 0 | 750.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 2 | 0.0 | 1.0 | 1.0 | Apartment | Entire home/apt | moderate | 1 | DC | 1.0 | 1.0 | 1 | 115.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 141 columns
In [51]:
from xgboost import XGBClassifier
In [52]:
X = amenities_df.drop(list(mlb.classes_), axis=1)
y = amenities_df[list(mlb.classes_)]
# y_flattened = y.values.argmax(axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
preprocessor = ColumnTransformer(
transformers=[
('encoder', OneHotEncoder(drop=None, sparse_output=False), ['property_type', 'city']),
('ord', ordinal_encoder, ['room_type', 'cancellation_policy']),
('scaler', StandardScaler(), ['accommodates','bedrooms','beds','bathrooms'])
],
remainder='passthrough')
multi_output_classifier = MultiOutputClassifier(XGBClassifier(objective='binary:logistic'))
rf_pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', multi_output_classifier)
])
rf_pipeline.fit(X_train, y_train)
predicted_amenities = rf_pipeline.predict(X_test)
predicted_amenities_labels = mlb.inverse_transform(predicted_amenities)
predicted_amenities_labels[:5]
Out[52]:
[('24-hour check-in',
'Air conditioning',
'Buzzer/wireless intercom',
'Cable TV',
'Carbon monoxide detector',
'Dryer',
'Essentials',
'Family/kid friendly',
'Fire extinguisher',
'First aid kit',
'Free parking on premises',
'Hair dryer',
'Hangers',
'Heating',
'Internet',
'Iron',
'Kitchen',
'Laptop friendly workspace',
'Self Check-In',
'Shampoo',
'Smoke detector',
'TV',
'Washer',
'Wireless Internet'),
('Cable TV',
'Carbon monoxide detector',
'Dryer',
'Essentials',
'Fire extinguisher',
'Hair dryer',
'Hangers',
'Heating',
'Internet',
'Kitchen',
'Laptop friendly workspace',
'Shampoo',
'Smoke detector',
'TV',
'Washer',
'Wireless Internet'),
('Air conditioning',
'Carbon monoxide detector',
'Essentials',
'Hair dryer',
'Hangers',
'Heating',
'Internet',
'Iron',
'Kitchen',
'Laptop friendly workspace',
'Shampoo',
'Smoke detector',
'TV',
'Wireless Internet'),
('Air conditioning',
'Essentials',
'Hangers',
'Heating',
'Kitchen',
'Smoke detector',
'TV',
'Wireless Internet'),
('Carbon monoxide detector',
'Dryer',
'Essentials',
'Fire extinguisher',
'Hair dryer',
'Heating',
'Internet',
'Kitchen',
'Laptop friendly workspace',
'Shampoo',
'Smoke detector',
'TV',
'Washer',
'Wireless Internet')]
In [53]:
average_f1_score = f1_score(y_test, predicted_amenities, average='micro')
print(f"Average F1 Score: {average_f1_score}")
Average F1 Score: 0.7217858015168374
In [59]:
pca_pipeline = Pipeline([('preprocessor', preprocessor), ('pca', PCA())])
pca_pipeline.fit_transform(X)
pipeline = Pipeline([
('pca', pca_pipeline),
('classifier', multi_output_classifier)
])
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
mse_cv = -cv_scores.mean()
plt.figure(figsize=(10, 6))
pca = pca_pipeline.named_steps['pca']
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle='-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Explained Variance Ratio by Number of Components')
plt.grid(True)
plt.show()
In [54]:
rf_pipeline_pca = Pipeline([
('pca', Pipeline([('preprocessor', preprocessor), ('pca', PCA(n_components=20))])),
('classifier', multi_output_classifier)
])
rf_pipeline_pca.fit(X_train, y_train)
Out[54]:
Pipeline(steps=[('pca',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('encoder',
OneHotEncoder(sparse_output=False),
['property_type',
'city']),
('ord',
OrdinalEncoder(categories=[['Shared '
'room',
'Private '
'room',
'Entire '
'home/apt'],
['flexible',
'moderate',
'strict',
'super_strict_30',
'super_strict_60']]),
['room_typ...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None,
max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=None,
n_jobs=None,
num_parallel_tree=None,
random_state=None, ...)))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('pca',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('encoder',
OneHotEncoder(sparse_output=False),
['property_type',
'city']),
('ord',
OrdinalEncoder(categories=[['Shared '
'room',
'Private '
'room',
'Entire '
'home/apt'],
['flexible',
'moderate',
'strict',
'super_strict_30',
'super_strict_60']]),
['room_typ...
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None,
max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None,
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=None,
n_jobs=None,
num_parallel_tree=None,
random_state=None, ...)))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('encoder',
OneHotEncoder(sparse_output=False),
['property_type', 'city']),
('ord',
OrdinalEncoder(categories=[['Shared '
'room',
'Private '
'room',
'Entire '
'home/apt'],
['flexible',
'moderate',
'strict',
'super_strict_30',
'super_strict_60']]),
['room_type',
'cancellation_policy']),
('scaler', StandardScaler(),
['accommodates', 'bedrooms',
'beds', 'bathrooms'])])),
('pca', PCA(n_components=20))])ColumnTransformer(remainder='passthrough',
transformers=[('encoder', OneHotEncoder(sparse_output=False),
['property_type', 'city']),
('ord',
OrdinalEncoder(categories=[['Shared room',
'Private room',
'Entire home/apt'],
['flexible',
'moderate',
'strict',
'super_strict_30',
'super_strict_60']]),
['room_type', 'cancellation_policy']),
('scaler', StandardScaler(),
['accommodates', 'bedrooms', 'beds',
'bathrooms'])])['property_type', 'city']
OneHotEncoder(sparse_output=False)
['room_type', 'cancellation_policy']
OrdinalEncoder(categories=[['Shared room', 'Private room', 'Entire home/apt'],
['flexible', 'moderate', 'strict', 'super_strict_30',
'super_strict_60']])['accommodates', 'bedrooms', 'beds', 'bathrooms']
StandardScaler()
['cleaning_fee', 'host_has_profile_pic', 'host_identity_verified', 'instant_bookable', 'price']
passthrough
PCA(n_components=20)
MultiOutputClassifier(estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
feature_types=None, gamma=None,
grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None, max_bin=None,
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None, max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=None, n_jobs=None,
num_parallel_tree=None,
random_state=None, ...))XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)In [74]:
cat_cols = ['property_type','room_type','bed_type','cancellation_policy','city']
data_json = {}
for col in cat_cols:
values = cleaned_df[col].unique()
data_json[col] = values.tolist()
with open('obj/cat_data.json', 'w', encoding='utf-8') as f:
json.dump(data_json, f, ensure_ascii=False, indent=4)
with open('obj/price_predictor.pkl', 'wb') as file:
pickle.dump(svr_pipeline_pca, file)
with open('obj/label_binarizer.pkl', 'wb') as file:
pickle.dump(mlb, file)
with open('obj/amenities_predictor.pkl', 'wb') as f:
pickle.dump(rf_pipeline_pca, f)
In [58]:
# import shutil
# shutil.copytree('obj/', 'bin/')